<?php

require './vendor/owner888/phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\db;

// The two marker comments below must stay exactly as they are: the script will not run if they are deleted.
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// Extraction rule set #1: one row per field, giving the field name, the XPath
// selector used to pull it from the page, and whether it is marked required.
// This set targets the "bread" / "arc_title" / "arc_content" page markup.
$fields_one = [
    // Meta keywords
    ['name' => 'keywords', 'selector' => "//meta[@name='keywords']/@content", 'required' => false],
    // Meta description
    ['name' => 'description', 'selector' => "//meta[@name='description']/@content", 'required' => false],
    // Category: second-to-last link in the breadcrumb
    ['name' => 'cate_name', 'selector' => "//div[@class='bread']/a[last()-1]", 'required' => true],
    // Article title
    ['name' => 'title', 'selector' => "//div[@class='arc_title']//h1", 'required' => true],
    // Publish time: first <span> inside the title block
    ['name' => 'time', 'selector' => "//div[@class='arc_title']//span[1]", 'required' => true],
    // Article body
    ['name' => 'content', 'selector' => "//div[@class='arc_content']", 'required' => true],
];

// Extraction rule set #2: same fields as rule set #1, but with selectors for
// the "crumbs_wrap" / "container" / "container_text" page markup.
$fields_two = [
    // Meta keywords
    ['name' => 'keywords', 'selector' => "//meta[@name='keywords']/@content", 'required' => false],
    // Meta description
    ['name' => 'description', 'selector' => "//meta[@name='description']/@content", 'required' => false],
    // Category: last link inside the breadcrumb wrapper
    ['name' => 'cate_name', 'selector' => "//div[@class='crumbs_wrap']//a[last()]", 'required' => true],
    // Article title
    ['name' => 'title', 'selector' => "//div[@class='container']//h1", 'required' => true],
    // Publish time
    ['name' => 'time', 'selector' => "//div[@class='container']//span[@class='time']", 'required' => true],
    // Article body
    ['name' => 'content', 'selector' => "//div[@class='container_text']", 'required' => true],
];

// Spider configuration for the www.te5.com news crawl.
$configs = [
    'name' => '特玩',
    'log_show' => false,
    'log_file' => 'tewan.log',
    'log_type' => 'warn,error,debug',
    'tasknum' => 3, // number of crawler tasks
    //'save_running_state' => true,
    // Primary domain(s)
    'domains' => [
        'www.te5.com',
    ],
    // Entry URLs
    'scan_urls' => [
        // Latest news
        'http://www.te5.com/news/list_186_1.html',
        // Guide roundup
        //'http://www.te5.com/news/shouyouzx/',
        // Game news
        //'http://www.te5.com/news/youxixinwen/',
        // Popular games -- for now only collect data under this one category list
        //'http://www.te5.com/news/danji/'
    ],
    // List-page URL matching rules
    'list_url_regexes' => [
        //"/news/(.*)/list_\d+_\d+.html"
    ],
    // Detail-page URL matching rules. The dot before "html" is escaped so it
    // only matches a literal "." (an unescaped dot matches any character,
    // e.g. "/news/123xhtml").
    'content_url_regexes' => [
        '/news/\d+\.html',
        '/news/\d+/\d+\.html',
    ],
    // NOTE(review): database credentials are hard-coded in source; consider
    // loading them from the environment or an untracked config file.
    'db_config' => [
        'host'  => '127.0.0.1',
        'port'  => 3306,
        'user'  => 'youxi',
        'pass'  => '708587132d',
        'name'  => 'youxi',
    ],
    // Active extraction rules. Swap in $fields_two to crawl again with the
    // alternate rule set.
    'fields' => $fields_one,
];



// Create the spider instance from $configs.
$spider = new phpspider($configs);

// Crawl-start hook: hands the entry list page to the spider, then sets up the
// database connection used by the later callbacks.
$spider->on_start = function($phpspider)
{
    // Entry list page.
    $phpspider->add_url("http://www.te5.com/news/list_186_1.html");

    // Register the 'default' connection from the spider's "db_config" entry
    // and initialise the db layer.
    db::set_connect('default', $phpspider->get_config("db_config"));
    db::_init();
};

// Detail page => per-field hook => post-process the extracted value.
//
// Only the "content" field is transformed: it is JSON-encoded (keeping
// non-ASCII characters unescaped), zlib-compressed, then base64-encoded so it
// can be stored as plain text. To read it back, reverse the steps:
// base64_decode -> gzuncompress -> json_decode.
//
// @param string $fieldname name of the field being extracted
// @param mixed  $data      extracted value for that field
// @param mixed  $page      page information passed by the spider (unused here)
// @return mixed the encoded content, '' when encoding fails, or $data unchanged
//               for every other field
$spider->on_extract_field = function($fieldname, $data, $page)
{
    if ($fieldname === 'content') {
        $json = json_encode($data, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // json_encode() failed (e.g. malformed UTF-8). Return an empty
            // string rather than compressing `false` into a junk payload, so
            // a later "content is not empty" check can reject the record.
            return '';
        }

        $compressed = gzcompress($json);
        if ($compressed === false) {
            return '';
        }

        $data = base64_encode($compressed);
    }

    return $data;
};

// Detail page => final extracted record => persist to the database.
//
// Builds the row for the yx_news table from the extracted fields, skips
// records whose title contains a blocked keyword or whose content is empty,
// and inserts the row unless a row with the same title already exists.
//
// @param mixed $page page information passed by the spider (unused here)
// @param array $data extracted fields keyed by field name
// @return array $data, unchanged
$spider->on_extract_page = function($page, $data)
{
    // Read a field, falling back to '' when it is absent, so a missing
    // optional field does not raise an undefined-index notice or pass null
    // to strip_tags().
    $field = function ($key) use ($data) {
        return isset($data[$key]) ? $data[$key] : '';
    };

    $savedata = [];
    $savedata["keywords"] = strip_tags($field("keywords"));       // keywords
    $savedata["description"] = strip_tags($field("description")); // description
    $savedata["cate_name"] = strip_tags($field("cate_name"));     // category name
    $savedata["title"] = strip_tags($field("title"));             // title
    $savedata['create_time'] = $field("time");                    // publish time
    $savedata["content"] = $field('content');                     // content (already encoded)
    $savedata['status'] = 1;                                      // status

    // Reject titles that contain an unwanted keyword.
    $blocked_keywords = ["传奇", "sf"];
    $has_kwd = false;
    foreach ($blocked_keywords as $keyword) {
        if (strpos($savedata['title'], $keyword) !== false) {
            $has_kwd = true;
            break;
        }
    }

    // Nothing to store: empty content or a blocked title.
    if ($savedata['content'] === '' || $has_kwd) {
        return $data;
    }

    // Duplicate check by title. The title is scraped (untrusted) text, so it
    // is escaped before being placed in the SQL string; otherwise a title
    // containing a quote (e.g. "Assassin's Creed") breaks the query.
    // NOTE(review): addslashes() is only safe for single-byte/UTF-8
    // connections; switch to a parameterised query if the db layer offers one.
    $escaped_title = addslashes($savedata['title']);
    $sql = "SELECT id from yx_news WHERE title='" . $escaped_title . "';";
    $row = db::get_one($sql);
    if (!$row) {
        db::insert("yx_news", $savedata); // save to the database
        echo '数据库已保存!';
    } else {
        echo '数据重复,已跳过...';
    }

    return $data;
};

// List-page hook: prints the page information for each list page visited.
//
// @param mixed $page      page information passed by the spider
// @param mixed $content   page content (unused here)
// @param mixed $phpspider the spider instance (unused here)
$spider->on_list_page = function($page, $content, $phpspider)
{
    // var_dump() writes its output directly and returns nothing, so
    // concatenating it into echo printed the dump *before* the label.
    // Emit the label first, then the dump.
    echo '采集列表页:';
    var_dump($page);
};


// Start the spider now that the configuration and all callbacks are registered.
$spider->start();

